{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(17010, 4)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>session_id</th>\n",
       "      <th>query</th>\n",
       "      <th>intent</th>\n",
       "      <th>slot_annotation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>打电话给551</td>\n",
       "      <td>phone_call.make_a_phone_call</td>\n",
       "      <td>打电话给&lt;phone_num&gt;551&lt;/phone_num&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>导航到乐至浙江省乐清市</td>\n",
       "      <td>navigation.navigation</td>\n",
       "      <td>导航到&lt;destination&gt;乐至浙江省乐清市&lt;/destination&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>导航到浙江省乐清市翁垟镇</td>\n",
       "      <td>navigation.navigation</td>\n",
       "      <td>导航到&lt;destination&gt;浙江省乐清市翁垟镇&lt;/destination&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>导航到长兴岛</td>\n",
       "      <td>navigation.navigation</td>\n",
       "      <td>导航到&lt;destination&gt;长兴岛&lt;/destination&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>导航到中南路233号</td>\n",
       "      <td>navigation.navigation</td>\n",
       "      <td>导航到&lt;destination&gt;中南路233号&lt;/destination&gt;</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   session_id         query                        intent  \\\n",
       "0           0       打电话给551  phone_call.make_a_phone_call   \n",
       "1           0   导航到乐至浙江省乐清市         navigation.navigation   \n",
       "2           0  导航到浙江省乐清市翁垟镇         navigation.navigation   \n",
       "3           1        导航到长兴岛         navigation.navigation   \n",
       "4           1    导航到中南路233号         navigation.navigation   \n",
       "\n",
       "                           slot_annotation  \n",
       "0           打电话给<phone_num>551</phone_num>  \n",
       "1   导航到<destination>乐至浙江省乐清市</destination>  \n",
       "2  导航到<destination>浙江省乐清市翁垟镇</destination>  \n",
       "3        导航到<destination>长兴岛</destination>  \n",
       "4    导航到<destination>中南路233号</destination>  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = pd.read_csv('../data/train.csv')\n",
    "print(train_df.shape)\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['text_len'] = train_df['query'].apply(lambda x: len(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 47 24\n"
     ]
    }
   ],
   "source": [
    "df_len_list = train_df.text_len.tolist()\n",
    "sorted_len = sorted(df_len_list)\n",
    "print(sorted_len[0], sorted_len[-1], sorted_len[int(len(sorted_len) * 0.995)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OTHERS                          5264\n",
       "music.play                      5051\n",
       "navigation.navigation           3165\n",
       "phone_call.make_a_phone_call    2287\n",
       "navigation.cancel_navigation     653\n",
       "music.pause                      237\n",
       "navigation.open                  188\n",
       "music.next                       111\n",
       "navigation.start_navigation       30\n",
       "phone_call.cancel                 19\n",
       "music.prev                         5\n",
       "Name: intent, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.intent.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
